Kim Hai's Mini Project 1: Exploration of Austin, Texas data from bikeshare and crime datasets¶
Import necessary Modules and Libraries¶
# Import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from datetime import datetime, date, time
import folium
import random
from datetime import timedelta
from google.cloud import bigquery
from google.cloud import storage
from google.cloud import bigquery_storage
%matplotlib inline
# Connect to the API and get the data
# Get the key to the Google Cloud platform from the secrets document
key_path = r'resolute-mote-462623-v5-c4c05ed61421.json' #: this json file contains the gcp BigQuery Secrets on user's local machine
Google Cloud Setup Code Chunk¶
# Google Cloud setup: authenticate a Storage client and a BigQuery client
# from the local service-account key file.
storage_client = storage.Client.from_service_account_json(key_path)
client = bigquery.Client.from_service_account_json(key_path)
print(client.project) # check if this is the correct project
# Re-point the BigQuery client at the public-data project in order to browse
# its datasets; query jobs themselves are later run under the personal project.
client = bigquery.Client.from_service_account_json(key_path, project = 'bigquery-public-data') # client to access the opensource data
#client = bigquery.Client.from_service_account_json(key_path, project = 'resolute-mote-462623-v5') # <- personal project code with 'owner' role
print(client.project)
resolute-mote-462623-v5 bigquery-public-data
# Check if austin_bikeshare and austin_crime dataset is in the list of datasets retrieved:
def printDatasetList(client):
    """Print the id of every dataset visible in the client's project.

    Returns True when at least one dataset was found, False otherwise.
    """
    project_name = client.project  # a client instance is tied to exactly one project
    dataset_list = list(client.list_datasets())
    if not dataset_list:
        print('{} project does not contain any datasets.'.format(project_name))
        return False
    print('Datasets in project {}:'.format(project_name))
    for ds in dataset_list:
        print('\t{}'.format(ds.dataset_id))
    return True
# list datasets in the default project:
#flag = printDatasetList(client) #: assigning to `flag` suppresses printing the return value (normally `True`)
austin_bikeshare_id='austin_bikeshare'
austin_crime_id='austin_crime'
austin_bikeshare_ref= client.dataset(austin_bikeshare_id, project = 'bigquery-public-data')
austin_crime_ref= client.dataset(austin_crime_id, project = 'bigquery-public-data')
bigquery-public-data.austin_bikeshare?
trips and stations +
austincrime?
https://console.cloud.google.com/bigquery?inv=1&invt=Abzz5w&project=resolute-mote-462623-v5
# Do some EDA
printTableList Helper function (Lifted from Lab 3.2.3)¶
# Check if the dataset has multiple tables
# function for listing tables in a dataset:
def printTableList(client, dataset_id):
    """Print the id of every table inside *dataset_id*.

    Returns True when the dataset contains at least one table, False otherwise.
    """
    dataset_ref = client.dataset(dataset_id, project=client.project)
    table_list = list(client.list_tables(dataset_ref))
    if not table_list:
        print('{} dataset does not contain any tables.'.format(dataset_id))
        return False
    print('Tables in dataset {}:'.format(dataset_id))
    for tbl in table_list:
        print('\t{}'.format(tbl.table_id))
    return True
#Bikeshare
printTableList(client, austin_bikeshare_id) # returns 2 tables: bikeshare stations and bikeshare trips
Tables in dataset austin_bikeshare: bikeshare_stations bikeshare_trips
True
# Crime
printTableList(client, austin_crime_id) # returns 1 table: crime
Tables in dataset austin_crime: crime
True
#Extract the respective tables with the API call:
# Bikeshare dataset (2 tables)
stations_table_id = 'bikeshare_stations'
trips_table_id = 'bikeshare_trips'
stations_table_ref = austin_bikeshare_ref.table(stations_table_id)
trips_table_ref = austin_bikeshare_ref.table(trips_table_id)
stations_table = client.get_table(stations_table_ref) # API Request
trips_table = client.get_table(trips_table_ref) # API Request
# Crime dataset:
crime_table_id = 'crime'
crime_table_ref = austin_crime_ref.table(crime_table_id)
crime_table = client.get_table(crime_table_ref) # API Request
printTableSchema Help function (Lifted from Lab 3.2.3)¶
def printTableSchema(aTable):
    """Print 'name field_type' for every column in *aTable*'s schema.

    Returns True when the table has at least one schema field; returns
    False (printing nothing) for an empty schema, matching the original.
    """
    fields = list(aTable.schema)
    if not fields:
        return False
    print('Table schema for {}:'.format(aTable.table_id))
    for field in fields:
        print('\t{0} {1}'.format(field.name, field.field_type))
    return True
Print the table schema so we will know how to index the columns of interest in the table¶
#Stations table schema:
printTableSchema(client.get_table(stations_table_ref))
Table schema for bikeshare_stations: station_id INTEGER name STRING status STRING location STRING address STRING alternate_name STRING city_asset_number INTEGER property_type STRING number_of_docks INTEGER power_type STRING footprint_length INTEGER footprint_width FLOAT notes STRING council_district INTEGER image STRING modified_date TIMESTAMP
True
#Trips table schema:
printTableSchema(client.get_table(trips_table_ref))
Table schema for bikeshare_trips: trip_id STRING subscriber_type STRING bike_id STRING bike_type STRING start_time TIMESTAMP start_station_id INTEGER start_station_name STRING end_station_id STRING end_station_name STRING duration_minutes INTEGER
True
# Crime table schema:
printTableSchema(client.get_table(crime_table_ref))
Table schema for crime: unique_key INTEGER address STRING census_tract FLOAT clearance_date TIMESTAMP clearance_status STRING council_district_code INTEGER description STRING district STRING latitude FLOAT longitude FLOAT location STRING location_description STRING primary_type STRING timestamp TIMESTAMP x_coordinate INTEGER y_coordinate INTEGER year INTEGER zipcode STRING
True
SQL queries on public data with personal project credentials¶
# Get all of stations table:
stations_all_sql = "SELECT * FROM bigquery-public-data.austin_bikeshare.bikeshare_stations"
# Query jobs must run under the personal project (the key has 'owner' there).
client = bigquery.Client.from_service_account_json(key_path, project = 'resolute-mote-462623-v5') # <- personal project code with 'owner' role
query_job = client.query(stations_all_sql)
stations_all_df = query_job.to_dataframe()
if len(stations_all_df) ==0:
    # Fall back to a previously saved local CSV when the query returns no rows
    stations_all_df=pd.read_csv(r"C:\Users\koh_k\Downloads\Institute of Data\Labs\Module 3\Labs 3\google-bigQuery-austin-bikeshare-stations-table-all.csv")
#stations_head_df
# Get all of trips table:
trips_all_sql = "SELECT * FROM bigquery-public-data.austin_bikeshare.bikeshare_trips"
client = bigquery.Client.from_service_account_json(key_path, project = 'resolute-mote-462623-v5') # <- personal project code with 'owner' role
query_job = client.query(trips_all_sql)
trips_all_df = query_job.to_dataframe()
# Get all of crimes table:
crimes_all_sql = "SELECT * FROM bigquery-public-data.austin_crime.crime"
client = bigquery.Client.from_service_account_json(key_path, project = 'resolute-mote-462623-v5') # <- personal project code with 'owner' role
query_job = client.query(crimes_all_sql)
crimes_all_df = query_job.to_dataframe()
# Write the stations dataframe to CSV only if the file does not already exist
# (mode='x' raises FileExistsError instead of overwriting).
try:
    stations_all_df.to_csv('google-bigQuery-austin-bikeshare-stations-table-all.csv',mode='x')
except FileExistsError:
    print('stations table already saved in .csv file in this location!')
except Exception as e:
    # BUG FIX: 'e' was outside the braces, so the exception value never printed
    print(f'caught {type(e)}: {e}')
stations table already saved in .csv file in this location!
# Write the trips dataframe to CSV only if the file does not already exist
# (mode='x' raises FileExistsError instead of overwriting).
try:
    trips_all_df.to_csv('google-bigQuery-austin-bikeshare-trips-table-all.csv',mode='x')
except FileExistsError:
    print('trips table already saved in .csv file in this location!')
except Exception as e:
    # BUG FIX: 'e' was outside the braces, so the exception value never printed
    print(f'caught {type(e)}: {e}')
trips table already saved in .csv file in this location!
# Write the crimes dataframe to CSV only if the file does not already exist
# (mode='x' raises FileExistsError instead of overwriting).
try:
    crimes_all_df.to_csv('google-bigQuery-austin-crime-crime-table-all.csv',mode='x')
except FileExistsError:
    print('crime table already saved in .csv file in this location!')
except Exception as e:
    # BUG FIX: 'e' was outside the braces, so the exception value never printed
    print(f'caught {type(e)}: {e}')
crime table already saved in .csv file in this location!
EDA of table(s)¶
austin_bikeshare.stations EDA¶
# Inspect the data
stations_all_df.head()
| station_id | name | status | location | address | alternate_name | city_asset_number | property_type | number_of_docks | power_type | footprint_length | footprint_width | notes | council_district | image | modified_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | OFFICE/Main/Shop/Repair | closed | (30.27186, -97.73997) | 1000 Brazos | None | <NA> | None | <NA> | None | <NA> | NaN | This is the testing dock/station located at BS... | 1 | None | 2022-03-04 09:58:00+00:00 |
| 1 | 1002 | 6th & Navasota St. | closed | (30.26383, -97.72864) | 1308 W. 6th St. | None | <NA> | None | <NA> | None | <NA> | NaN | None | 3 | None | 2021-01-04 00:00:00+00:00 |
| 2 | 1003 | 8th & Guadalupe | closed | (30.27106, -97.74563) | 800 Guadalupe St. | None | <NA> | None | <NA> | None | <NA> | NaN | None | 9 | None | 2021-01-04 00:00:00+00:00 |
| 3 | 1004 | Red River & LBJ Library | closed | (30.2848, -97.72756) | 2322 Red River Street | None | <NA> | None | <NA> | None | <NA> | NaN | None | 1 | None | 2021-01-04 00:00:00+00:00 |
| 4 | 2576 | Rainey @ River St | closed | (30.25802, -97.7391) | 64 Rainey St | None | <NA> | None | <NA> | None | <NA> | NaN | None | 9 | None | 2021-01-04 00:00:00+00:00 |
Comment: We can see that there are a lot of null values in this table just within the first 5 rows, let's understand the table better with .info()¶
stations_all_df.info()
# 101 entries,0 to 100 rows
# Columns 5 alternate name 99 NULL values
# " 6 city asset number 24 NULL values
# " 7 property type 20 NULL values
# " 8 number of docks 20 NULL values
# " 9 power_type 20 NULL values
# " 10 footprint length 22 NULL values
# " 11 foorprint width 22 NULL values
# " 13 notes 70 NULL values
# " 14 image 101 NULL values -> drop
<class 'pandas.core.frame.DataFrame'> RangeIndex: 101 entries, 0 to 100 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 station_id 101 non-null Int64 1 name 101 non-null object 2 status 101 non-null object 3 location 101 non-null object 4 address 100 non-null object 5 alternate_name 2 non-null object 6 city_asset_number 77 non-null Int64 7 property_type 81 non-null object 8 number_of_docks 81 non-null Int64 9 power_type 81 non-null object 10 footprint_length 79 non-null Int64 11 footprint_width 79 non-null float64 12 notes 31 non-null object 13 council_district 101 non-null Int64 14 image 0 non-null object 15 modified_date 101 non-null datetime64[us, UTC] dtypes: Int64(5), datetime64[us, UTC](1), float64(1), object(9) memory usage: 13.2+ KB
stations_all_df.describe()
| station_id | city_asset_number | number_of_docks | footprint_length | footprint_width | council_district | |
|---|---|---|---|---|---|---|
| count | 101.0 | 77.0 | 81.0 | 79.0 | 79.000000 | 101.0 |
| mean | 2914.316832 | 22933.402597 | 13.580247 | 38.810127 | 5.512658 | 6.574257 |
| std | 969.304833 | 7814.504916 | 2.827827 | 8.498983 | 1.527472 | 3.302564 |
| min | 0.0 | 16597.0 | 9.0 | 15.0 | 5.000000 | 1.0 |
| 25% | 2539.0 | 16738.0 | 12.0 | 35.0 | 5.000000 | 3.0 |
| 50% | 2575.0 | 16756.0 | 13.0 | 40.0 | 5.000000 | 9.0 |
| 75% | 3790.0 | 32670.0 | 15.0 | 40.0 | 5.000000 | 9.0 |
| max | 4879.0 | 32746.0 | 22.0 | 55.0 | 15.000000 | 10.0 |
only footprint length and width are meaningful columns for this summary statistics¶
# Drop image column
stations_all_df.drop(['image'],axis=1,inplace=True)
Stations table data interrogation¶
Value counts of columns in stations table¶
stations_all_df['alternate_name'].value_counts() # not a very interesting column to investigate
alternate_name Zilker Park at Barton Springs and William Burton Drive 1 Congress & 6th Street 1 Name: count, dtype: int64
stations_all_df['property_type'].value_counts()
property_type sidewalk 28 undetermined_parking 20 paid_parking 19 nonmetered_parking 8 parkland 6 Name: count, dtype: int64
stations_all_df['footprint_length'].value_counts() # could be used for visualisation
footprint_length 40 38 35 8 30 8 45 7 55 7 20 3 50 3 25 3 26 1 15 1 Name: count, dtype: Int64
stations_all_df['footprint_width'].value_counts()
footprint_width 5.0 68 7.5 7 10.0 2 15.0 1 8.0 1 Name: count, dtype: int64
stations_all_df['council_district'].value_counts() # will need to cross reference with 3rd party information to understand this
council_district 9 59 1 16 3 16 5 5 8 3 10 2 Name: count, dtype: Int64
#stations_all_df
austin_bikeshare.trips EDA¶
trips_all_df.head() # over 2million rows! limit prints and tests
| trip_id | subscriber_type | bike_id | bike_type | start_time | start_station_id | start_station_name | end_station_id | end_station_name | duration_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27528286 | Single Trip (Pay-as-you-ride) | 21422 | electric | 2022-08-20 14:03:38+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 102 |
| 1 | 28953173 | Student Membership | 21821 | electric | 2023-02-26 02:33:47+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 2 |
| 2 | 28833031 | Explorer | 18181 | electric | 2023-02-04 17:52:20+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 4 |
| 3 | 28181257 | Local31 | 829 | classic | 2022-10-13 14:42:31+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 13 |
| 4 | 28344994 | Explorer | 996 | classic | 2022-10-29 16:01:27+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 129 |
trips_all_df.info() # over 2million rows! limit prints and tests
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2271152 entries, 0 to 2271151 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 trip_id object 1 subscriber_type object 2 bike_id object 3 bike_type object 4 start_time datetime64[us, UTC] 5 start_station_id Int64 6 start_station_name object 7 end_station_id object 8 end_station_name object 9 duration_minutes Int64 dtypes: Int64(2), datetime64[us, UTC](1), object(7) memory usage: 177.6+ MB
trips_all_df.describe()
| start_station_id | duration_minutes | |
|---|---|---|
| count | 2266705.0 | 2271152.0 |
| mean | 3252.173349 | 28.719178 |
| std | 1033.208271 | 125.316666 |
| min | 1001.0 | 2.0 |
| 25% | 2547.0 | 6.0 |
| 50% | 2707.0 | 11.0 |
| 75% | 3797.0 | 27.0 |
| max | 7637.0 | 34238.0 |
Value counts of columns in trips table¶
trips_all_df['duration_minutes'].values
<IntegerArray> [102, 2, 4, 13, 129, 106, 20, 32, 40, 3, ... 53, 30, 149, 221, 2, 2, 32, 34, 26, 27] Length: 2271152, dtype: Int64
np.sort(trips_all_df['duration_minutes'].value_counts())
array([ 1, 1, 1, ..., 151142, 167513, 171706], dtype=int64)
# Positional indices of trips lasting at most / more than one hour.
upToSixtyMins_idx= np.where(trips_all_df['duration_minutes']<=60.0)
overSixtyMins_idx= np.where(trips_all_df['duration_minutes']>60.0)
# Single-column frames (duration only) used for the histograms below.
upToSixtyMins=trips_all_df[['duration_minutes']].iloc[upToSixtyMins_idx]
overSixtyMins=trips_all_df[['duration_minutes']].iloc[overSixtyMins_idx]
Percentage of upToSixtyMins¶
print(len(upToSixtyMins)/len(trips_all_df['duration_minutes']))
0.9250789907500687
Visual Distribution of upToSixtyMins¶
# Side-by-side histograms of trip duration: up to 1h vs over 1h, with a
# shared y-limit so the two panels are directly comparable.
plt.figure()
plt.suptitle('Understanding distribution of duration (minutes)')
ax1=plt.subplot(121)
ax1.set_ylim([0 ,1.80e5])
#ax1.set_xlim([0,30])
plt.hist(upToSixtyMins,bins=60,edgecolor="black")
plt.title('Rides under 1Hr (60 Bins)')
ax2=plt.subplot(122)
ax2.set_ylim([0 ,1.80e5])
ax2.set_xlim([31,1300])  # long-tail rides; x starts past the 1h cutoff
plt.hist(overSixtyMins,bins=600,edgecolor="black")
plt.title('Rides over 1Hr (600 Bins)')
plt.show()
trips_all_df['bike_type'].value_counts()
bike_type classic 1347722 electric 923430 Name: count, dtype: int64
trips_all_df['start_station_name'].value_counts()
start_station_name
21st/Speedway @ PCL 108559
21st & Speedway @PCL 71145
Dean Keeton/Speedway 65066
Zilker Park 49294
26th/Nueces 44179
...
Eeyore's 2018 2
Eeyore's 2017 1
Mobile Station 1
Stolen 1
cesar Chavez/Congress 1
Name: count, Length: 201, dtype: int64
trips_all_df['subscriber_type'].value_counts() # very rich column, but may require a deep understanding of the business operations?
subscriber_type Student Membership 430347 Local365 389728 U.T. Student Membership 289108 24 Hour Walk Up Pass 272860 Walk Up 260044 Local31 162167 Explorer 120252 Pay-as-you-ride 82745 Local30 55777 3-Day Weekender 51778 Weekender 40500 Single Trip (Pay-as-you-ride) 35320 Local365+Guest Pass 14000 Single Trip 11709 Founding Member 6130 3-Day Explorer 6016 7-Day 5684 Single Trip 5122 Local365 ($80 plus tax) 3886 Semester Membership 3231 Single Trip Ride 3083 Local365- 1/2 off Anniversary Special 2907 HT Ram Membership 2686 Annual 2208 Annual Membership 1307 Local30 ($11 plus tax) 820 $1 Pay by Trip Winter Special 764 Local365 Youth (age 13-17 riders) 676 Annual 639 Try Before You Buy Special 595 Weekender ($15 plus tax) 550 $1 Pay by Trip Fall Special 466 Local365+Guest Pass- 1/2 off Anniversary Special 466 Annual Member 413 ACL Weekend Pass Special 319 Republic Rider (Annual) 293 Annual Pass 236 ACL 2019 Pass 207 Explorer ($8 plus tax) 182 Annual Plus 172 RideScout Single Tide 109 Aluminum Access 96 Local365 Youth with helmet (age 13-17 riders) 91 Madtown Monthly 63 FunFunFun Fest 3 Day Pass 27 Annual Pass (Original) 27 Annual Plus Membership 26 Republic Rider 16 Membership: pay once one-year commitment 13 Denver B-cycle Founder 12 Heartland Pass (Annual Pay) 12 UT Student Membership 9 Annual Membership 7 RideScout Single Ride 4 Local365 Youth (age 13-17 riders)- 1/2 off Special 3 Membership: pay once, one-year commitment 2 Annual Pass (30 minute) 2 Heartland Pass (Monthly Pay) 1 RESTRICTED 1 24-Hour Membership 1 Name: count, dtype: int64
# Trips from Dec-13 to Jun-24 in datetime object
np.sort(trips_all_df['start_time'].values) #
array(['2013-12-12T16:48:46.000000', '2013-12-13T09:33:17.000000',
'2013-12-13T09:39:38.000000', ..., '2024-06-30T23:26:20.000000',
'2024-06-30T23:42:44.000000', '2024-06-30T23:44:03.000000'],
dtype='datetime64[us]')
trips_all_df['bike_id'].value_counts()
bike_id
18947 3994
19330 3970
19403 3943
19271 3918
19677 3876
...
Block03 2
Block 12 2
Block04 2
Block 25 1
Block05 1
Name: count, Length: 1024, dtype: int64
Trips table data exploration summary:¶
The majority of the data is under an hour, so I made the executive decision to drop long bike rides for the initial assessment, as they are bound to contain extreme outliers
What is the percentage of data kept vs left out?
This was found earlier:
print(len(upToSixtyMins)/len(trips_all_df['duration_minutes']))=0.9250789907500687
Based on subscriber type, we can see some commuters would be incentivised to maximise ride duration, as they only have a limited time to access the bikes; others may be biased towards taking multiple short opportunistic trips, as they hold long-term passes (monthly/annual)
austin_crime.crime EDA¶
crimes_all_df.info()
# There is a lot of nulled data
# Column 0 unique_key has 1 null
# Column 1 address has 76,032 null
# Column 2 census_tract has 523 null
# Column 3 clearance_date has 5,114 null
# Column 4 clearance_status has 5,114 null
# Column 5 council_district_code has 675 null
# Column 8 latitude has 82,172 null
# Column 9 longitude has 82,172 null
# Column 14 x_coordinate has 2147 null
# Column 15 y_coordinate has 2147 null
# Column 17 zipcode has 523 null
<class 'pandas.core.frame.DataFrame'> RangeIndex: 116672 entries, 0 to 116671 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 unique_key 116671 non-null Int64 1 address 40640 non-null object 2 census_tract 116149 non-null float64 3 clearance_date 111558 non-null datetime64[us, UTC] 4 clearance_status 111558 non-null object 5 council_district_code 115997 non-null Int64 6 description 116672 non-null object 7 district 116672 non-null object 8 latitude 34500 non-null float64 9 longitude 34500 non-null float64 10 location 116672 non-null object 11 location_description 116672 non-null object 12 primary_type 116672 non-null object 13 timestamp 116672 non-null datetime64[us, UTC] 14 x_coordinate 114525 non-null Int64 15 y_coordinate 114525 non-null Int64 16 year 116672 non-null Int64 17 zipcode 116149 non-null object dtypes: Int64(5), datetime64[us, UTC](2), float64(3), object(8) memory usage: 16.6+ MB
crimes_all_df.iloc[np.where(crimes_all_df['x_coordinate'].notnull())]
# Checking where x coord is not null, there could be rows where missing lat/long data could be imputed?
# reveals that primary_type column refers to the primary charge of the unique incident at the address
| unique_key | address | census_tract | clearance_date | clearance_status | council_district_code | description | district | latitude | longitude | location | location_description | primary_type | timestamp | x_coordinate | y_coordinate | year | zipcode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 135 | 20155045678 | None | NaN | 2015-10-19 00:00:00+00:00 | Not cleared | <NA> | THEFT | F | NaN | NaN | (0 \n1 \n2 \n3 ... | 8704 MERIDIAN OAK LN | Theft | 2015-10-15 00:00:00+00:00 | 3144494 | 10031451 | 2015 | None |
| 136 | 20153101915 | None | NaN | 2016-01-05 00:00:00+00:00 | Cleared by Exception | <NA> | AGG ROBBERY/DEADLY WEAPON | F | NaN | NaN | (0 \n1 \n2 \n3 ... | E RIVERSIDE | Robbery | 2015-11-06 00:00:00+00:00 | 3113492 | 10038801 | 2015 | None |
| 137 | 20155053782 | None | NaN | 2015-12-10 00:00:00+00:00 | Not cleared | <NA> | BURGLARY OF VEHICLE | F | NaN | NaN | (0 \n1 \n2 \n3 ... | 212 COLLINGWOOD DR | Theft | 2015-12-09 00:00:00+00:00 | 3091347 | 10039561 | 2015 | None |
| 138 | 20155036648 | None | NaN | 2015-09-02 00:00:00+00:00 | Not cleared | <NA> | BURGLARY OF VEHICLE | H | NaN | NaN | (0 \n1 \n2 \n3 ... | 505 TERRACE DR | Theft | 2015-08-20 00:00:00+00:00 | 3040396 | 10039976 | 2015 | None |
| 139 | 20153440622 | None | NaN | 2015-12-19 00:00:00+00:00 | Not cleared | <NA> | AUTO THEFT | F | NaN | NaN | (0 \n1 \n2 \n3 ... | 7707 S IH35 | Auto Theft | 2015-12-10 00:00:00+00:00 | 3118720 | 10042867 | 2015 | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 116667 | 20163590195 | None | 9800.0 | NaT | None | 2 | AGG ROBBERY/DEADLY WEAPON | H | NaN | NaN | (0 \n1 \n2 \n3 ... | 2406 S US 183 HWY NB | Robbery | 2016-12-24 00:00:00+00:00 | 3133568 | 10051238 | 2016 | 78719 |
| 116668 | 20143150639 | 9400 FREIGHT LN Austin, TX 78719 (30.213588, -... | 9800.0 | 2015-02-10 00:00:00+00:00 | Not cleared | 2 | THEFT | AP | 30.213588 | -97.669473 | (0 \n1 \n2 ... | 9400 FREIGHT LN | Theft: All Other Larceny | 2014-11-11 00:00:00+00:00 | 3137505 | 10051853 | 2014 | 78719 |
| 116669 | 2014851498 | 1800 BLOCK E SH 71 EB Austin, TX 78719 | 9800.0 | 2014-07-29 00:00:00+00:00 | Cleared by Exception | 2 | DEADLY CONDUCT | H | NaN | NaN | (0 \n1 \n2 ... | 1800 BLOCK E SH 71 EB | Aggravated Assault | 2014-03-26 00:00:00+00:00 | 3137185 | 10053684 | 2014 | 78719 |
| 116670 | 2016180312 | None | 9800.0 | 2016-01-21 00:00:00+00:00 | Not cleared | 2 | BURGLARY NON RESIDENCE | H | NaN | NaN | (0 \n1 \n2 \n3 ... | 1500 E SH 71 EB | Burglary | 2016-01-18 00:00:00+00:00 | 3135207 | 10054563 | 2016 | 78719 |
| 116671 | 20142500470 | 1500 E SH 71 EB Austin, TX 78719 | 9800.0 | 2014-09-10 00:00:00+00:00 | Not cleared | 2 | THEFT | H | NaN | NaN | (0 \n1 \n2 ... | 1500 E SH 71 EB | Theft: All Other Larceny | 2014-09-07 00:00:00+00:00 | 3135207 | 10054564 | 2014 | 78719 |
114525 rows × 18 columns
crimes_all_df['location'].values
# Seems to be some kind of JSON object?
array(['(0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: latitude, Length: 38572, dtype: object,0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: longitude, Length: 38572, dtype: object)',
'(0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: latitude, Length: 38572, dtype: object,0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: longitude, Length: 38572, dtype: object)',
'(0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: latitude, Length: 38572, dtype: object,0 \n1 \n2 \n3 \n4 \n ..\n38567 \n38568 \n38569 \n38570 \n38571 \nName: longitude, Length: 38572, dtype: object)',
...,
'(0 \n1 \n2 \n3 \n4 \n ... \n40635 30.264271\n40636 30.221486\n40637 30.233661\n40638 30.31604\n40639 30.293056\nName: latitude, Length: 40640, dtype: object,0 \n1 \n2 \n3 \n4 \n ... \n40635 -97.71608\n40636 -97.690245\n40637 -97.790092\n40638 -97.668431\n40639 -97.735881\nName: longitude, Length: 40640, dtype: object)',
'(0 \n1 \n2 \n3 \n4 \n ..\n37455 \n37456 \n37457 \n37458 \n37459 \nName: latitude, Length: 37460, dtype: object,0 \n1 \n2 \n3 \n4 \n ..\n37455 \n37456 \n37457 \n37458 \n37459 \nName: longitude, Length: 37460, dtype: object)',
'(0 \n1 \n2 \n3 \n4 \n ... \n40635 30.264271\n40636 30.221486\n40637 30.233661\n40638 30.31604\n40639 30.293056\nName: latitude, Length: 40640, dtype: object,0 \n1 \n2 \n3 \n4 \n ... \n40635 -97.71608\n40636 -97.690245\n40637 -97.790092\n40638 -97.668431\n40639 -97.735881\nName: longitude, Length: 40640, dtype: object)'],
dtype=object)
Making sure that all pairs of latitude and longitude are either nulled or non-nulled and not either/ or¶
print(crimes_all_df['latitude'].isnull())
print(crimes_all_df['longitude'][crimes_all_df['latitude'].isnull()].info())
print(crimes_all_df['latitude'][crimes_all_df['longitude'].isnull()].info())
# Check if any null values of lat contains any non null value of long and vice-versa
0 True
1 True
2 True
3 True
4 True
...
116667 True
116668 False
116669 True
116670 True
116671 True
Name: latitude, Length: 116672, dtype: bool
<class 'pandas.core.series.Series'>
Index: 82172 entries, 0 to 116671
Series name: longitude
Non-Null Count Dtype
-------------- -----
0 non-null float64
dtypes: float64(1)
memory usage: 1.3 MB
None
<class 'pandas.core.series.Series'>
Index: 82172 entries, 0 to 116671
Series name: latitude
Non-Null Count Dtype
-------------- -----
0 non-null float64
dtypes: float64(1)
memory usage: 1.3 MB
None
crimes_all_df['council_district_code'].value_counts()
council_district_code 3 18109 9 17662 4 16656 7 13439 1 11900 2 10397 5 10130 8 6226 6 6223 10 5255 Name: count, dtype: Int64
stations_all_df['council_district'].value_counts()
council_district 9 59 1 16 3 16 5 5 8 3 10 2 Name: count, dtype: Int64
NA district data from crimes table¶
#X-ref this with station data
crime_with_no_stns= list(set(crimes_all_df['council_district_code'])-set(stations_all_df['council_district']))
crime_with_no_stns=[ x for x in crime_with_no_stns if type(x) == np.int64]
print(crime_with_no_stns)
# 675 nulled values from crimes data set, drop these first and if have time, come back to impute the district data for these (if available) locations/long/lat/x/y
[2, 4, 6, 7]
Comment:¶
*Comparing and Contrasting the council district data in the 2 tables reveal that some district have recorded crimes between 2014 to 2016 but do not have any stations recorded as installed in the district, this could be used for future analysis*
District 2,4,6,7 have crime data but no stations data
Planning District Document
https://services.austintexas.gov/edims/document.cfm?id=375177
District 2: White | Hispanic | Black | Asian
23.7| 63.7 | 9.3| 3.3
District 4: White | Hispanic | Black | Asian
25.6| 59.2 | 10.4| 4.6
District 6: White | Hispanic | Black | Asian
46.7| 16.6 | 7.1| 28.4
District 7: White | Hispanic | Black | Asian
54.2| 22.9 | 9.4| 12.3
No significant racial trend with crime on the surface, might be interesting to drill down on the primary crime grouped by districts ->DO WITH SQL/ pd DataFrames groupby?
Unique values of year column¶
crimes_all_df['year'].value_counts()
year 2014 40640 2015 38572 2016 37460 Name: count, dtype: Int64
# 116,672 non null data from 01/01/2014 to 31/12/2016 -> can drop station data from 01/01/2017 to 31/12/2024?
print(np.sort(crimes_all_df['timestamp']))
print(len(crimes_all_df['timestamp'].notnull()))
[Timestamp('2014-01-01 00:00:00+0000', tz='UTC')
Timestamp('2014-01-01 00:00:00+0000', tz='UTC')
Timestamp('2014-01-01 00:00:00+0000', tz='UTC') ...
Timestamp('2016-12-31 00:00:00+0000', tz='UTC')
Timestamp('2016-12-31 00:00:00+0000', tz='UTC')
Timestamp('2016-12-31 00:00:00+0000', tz='UTC')]
116672
Visualising Geospatial data with folium¶
GenerateBaseMap helper function (Lifted from Lab 2.1.2)¶
def generateBaseMap(default_location=(30.266666, -97.733330), default_zoom_start=14):
    '''
    Create a folium base map centred on Austin, TX.

    Parameters
    ----------
    default_location : sequence of float
        (latitude, longitude) of the initial map centre.
    default_zoom_start : int
        Initial folium zoom level.

    Returns
    -------
    folium.Map
    '''
    # Tuple default avoids the shared-mutable-default-argument pitfall;
    # callers passing a list still work unchanged.
    base_map = folium.Map(
        location = list(default_location)
        , control_scale = True
        , zoom_start = default_zoom_start
    )
    return base_map
Display austin_base_map¶
# Simple data summary
austin_base_map= generateBaseMap()
display(austin_base_map)
Importance of iterative coding and use of new tools (GPT)¶
Extract the JSON data from this url¶
# Fetch the Austin city-council district boundaries as JSON.
# Re-assembled: the original cell's lines were merged during export, and
# urllib.request must be imported explicitly for urlopen to be available.
import urllib.request
import json

austin_city_council_boundaries_url = (
    "https://services.arcgis.com/0L95CJ0VTaxqcmED/ArcGIS/rest/services/"
    "BOUNDARIES_single_member_districts/FeatureServer/0?f=pjson"
)

city_council_data = None
with urllib.request.urlopen(austin_city_council_boundaries_url) as response:
    city_council_data = json.load(response)
data is null for the polygons we are interested in¶
https://realpython.com/python-folium-web-maps-from-data/#add-a-geojson-countries-layer
https://austin.maps.arcgis.com/home/item.html?id=8251869f87e24d3abff3d7a36fc7c4f4
[out:json]; area["name"="Austin"]["admin_level"="8"]; (way["boundary"="administrative"]["name"~"District"] (area); relation["boundary"="administrative"]["name"~"District"] (area);); out body geom;
Add boundary layer to basemap¶
# Load the council-district boundary polygons from a local GeoJSON export
# instead (the ArcGIS endpoint above returned no polygon geometry).
city_council_data=None
with open(r"C:\Users\koh_k\Downloads\Institute of Data\Labs\Module 3\Labs 3\Single Member Council Districts_20250619.geojson", encoding="utf-8") as f:
    city_council_data = json.load(f)
Data Source:¶
https://data.austintexas.gov/dataset/Single-Member-Council-Districts/8spj-utxs/data_preview
re-directed from ChatGPT
# GPT generated code:
# Sample 10 shades from the matplotlib "Greens" colormap
# (comment previously said "Blues"; the code actually uses Greens)
greens = plt.cm.Greens # This is a colormap object; callable with an int in [0, 255]
green_shades = [greens(i) for i in range(50, 256, 21)] # Skip very light shades; yields 10 RGBA samples
# Convert RGBA to HEX
def rgba_to_hex(rgba):
    """Convert an RGBA tuple of floats in [0, 1] to a '#rrggbb' hex string.

    The alpha channel (if present) is ignored; channel values are truncated
    rather than rounded, matching the original implementation.
    """
    r, g, b = (int(channel * 255) for channel in rgba[:3])
    return f'#{r:02x}{g:02x}{b:02x}'
# NOTE(review): despite the name, this list holds GREEN shades (it samples
# green_shades above) — consider renaming to green_hex_colors.
red_hex_colors = [rgba_to_hex(color) for color in green_shades[:10]]
# Tag every district polygon with a randomly chosen fill colour.
for feature in city_council_data['features']:
    feature['properties']['color'] = random.choice(red_hex_colors)
# GPT generated code, edited to fit in context
def style_function(feature):
return {
'fillColor': feature['properties'].get('color', 'blue'),
'color': 'black', # Border color
'weight': 1,
'fillOpacity': 0.6
}
# Add boundary layer to basemap and confirm visually
try:
folium.GeoJson(city_council_data,style_function=style_function).add_to(austin_base_map)
except Exception as e:
print(e)
# Visualise:
austin_base_map
Simple Visual analysis based on external sources about council district column¶
https://maps.austintexas.gov/GIS/CouncilDistrictMap/
Based on the map view at this link, it seems that these 6 districts are geographically proximate to each other, running westerly (5, 8, 10) to easterly (9, 1, 3). The hole bounded by districts 8 and 10 may belong to private property: Davenport Ranch, Westlake Highlands, and West Lake Hills.
The hole in district 1 should be the WM Austin Community Landfill
District 7: Wells Branch/ Willow branch hole: private development?
District 6: Cat Hollow: Private development?
District 9 contains the state capitol, downtown core, University of Texas campus, also has highest number of stations, built where people who can use it?
Bike Station distibution based on google maps with the biking trails turned on, seems to be highly correlated with the biking trails: https://www.google.com/maps/@30.2998663,-97.8651785,11.75z/data=!5m1!1e3?entry=ttu&g_ep=EgoyMDI1MDYxMS4wIKXMDSoASAFQAw%3D%3D
Except for districts 4,7,6, the north west region
Consulting with OpenStreetmap: https://www.openstreetmap.org/#map=14/30.28635/-97.74613&layers=C
It seems that the cycle map layer could be accessed by folium, increasing the value of geospatial plotting of (incomplete) station details on the map for easier visualisation.
Adding Cycle Paths layer onto the map¶
cycle_path_data=[]
with open(r"cycle_path_data.geojson", encoding="utf-8") as f:
cycle_path_data = json.load(f)
folium.GeoJson(cycle_path_data).add_to(austin_base_map)
# Visualise:
austin_base_map
Comment:¶
*We can see that what is labelled as a bike trail is discontinuous and concentrated in certain regions (i.e. district 9 [near the university and CBD area] and district 8 [in neighbourhoods near a park])*
Add stations to map with Folium Markers¶
# Add one folium marker per station; stations with a null location are skipped.
# The 'location' column is a string such as "(30.27186, -97.73997)"
# (Austin itself is roughly 30.266666, -97.733330).
cleaned_stations_df = stations_all_df[stations_all_df['location'].notnull()]

for _, station in cleaned_stations_df.iterrows():
    # Build the popup table for this marker.
    # BUG FIX: the last column held modified_date but was labelled 'Status'.
    MarkerInfo = pd.DataFrame(
        data=[[station['station_id'],
               station['name'],
               station['address'],
               station['modified_date']]],
        columns=['Station ID', 'Station Name', 'Address', 'Modified Date'])
    htmlMarkerInfo = MarkerInfo.to_html()
    # Parse the "(lat, long)" string into two floats.
    stnLat, stnLong = station['location'].strip('()').split(',')
    folium.Marker(location=[float(stnLat), float(stnLong)],
                  popup=folium.Popup(htmlMarkerInfo, max_width="500%")).add_to(austin_base_map)
# Display additions:
austin_base_map
Visualise crime data with political boundaries¶
# Build a fresh base map for the crime data and overlay the council-district
# political boundaries on it; print (rather than raise) any overlay failure.
crime_district_map = generateBaseMap(default_zoom_start=10)
try:
    folium.GeoJson(city_council_data, style_function=style_function).add_to(crime_district_map)
except Exception as boundary_err:
    print(boundary_err)
from folium.plugins import MarkerCluster
# Place each crime into a clustered marker; clusters keep the map responsive.
cleaned_crimes_df = crimes_all_df[crimes_all_df['latitude'].notnull()]
# Initialize MarkerCluster
marker_cluster = MarkerCluster().add_to(crime_district_map)
# Marker colour encodes the crime year; unknown years fall back to gray.
year_colours = {'2014': 'orange',
                '2015': 'pink',
                '2016': 'purple'}
# PERF FIX: the previous range-loop did several positional .iloc lookups per
# row; itertuples makes a single pass over the frame with attribute access.
for crime in cleaned_crimes_df.itertuples():
    # Popup carries address, description and year for the incident.
    marker_info = pd.DataFrame(
        data=[[crime.address, crime.description, crime.year]],
        columns=['Address', 'Description', 'Year'],
    )
    html_marker_info = marker_info.to_html(index=False)
    # Determine marker color based on year
    color = year_colours.get(str(crime.year), 'gray')
    folium.Marker(
        location=[crime.latitude, crime.longitude],
        popup=folium.Popup(html_marker_info, max_width="500%"),
        icon=folium.Icon(color=color),
    ).add_to(marker_cluster)
# display
crime_district_map.save("crime_district_map.html")
Open the crime_district_map.html document in a new browser tab¶
Cross-Referencing the stations data:¶
stations_all_df['council_district'].value_counts() returns:
| council | district |
|---|---|
| 9 | 59 |
| 1 | 16 |
| 3 | 16 |
| 5 | 5 |
| 8 | 3 |
| 10 | 2 |
# Crimes recorded per council district (NaN district codes excluded), largest first.
count_of_crimes_in_district = (
    cleaned_crimes_df
    .groupby("council_district_code", dropna=True)["primary_type"]
    .count()
    .sort_values(ascending=False)
)
print(count_of_crimes_in_district)
council_district_code 3 5859 9 4928 4 4909 7 3817 1 3693 2 3127 5 2820 8 1760 10 1758 6 1671 Name: primary_type, dtype: int64
Surface Level Analysis: count_of_crimes_in_district¶
We can say that in districts where bike stations have been installed, the more stations a district has, the more crimes the city recorded in that district between 2014 and 2016.
5/6 districts adhere to this observed trend, except for District 3, which has the highest number of crimes but is only middle of the table for number of stations installed.
Perhaps we can investigate if the type of crimes affect the installation of stations?
# Count crimes per (district, crime type), then print each district's crime
# types ranked from most to least frequent.
top_crimes_per_district = cleaned_crimes_df.groupby(
    ["council_district_code", "primary_type"], dropna=True)["primary_type"].count()
top_crimes_df = top_crimes_per_district.reset_index(name="count")
for district, district_crimes in top_crimes_df.groupby("council_district_code"):
    print(f"District {district}:")
    ranked = district_crimes[["primary_type", "count"]].sort_values(by="count", ascending=False)
    print(ranked)
    print("-" * 40)
District 1:
primary_type count
6 Theft: All Other Larceny 1109
8 Theft: BOV 905
2 Burglary / \nBreaking & Entering 836
0 Aggravated Assault 301
1 Auto Theft 195
11 Theft: Shoplifting 100
5 Robbery 86
4 Rape 71
10 Theft: Pocket Picking 39
7 Theft: Auto Parts 27
12 Theft: from Building 10
9 Theft: Coin Op Machine 9
3 Homicide: Murder & Nonnegligent Manslaughter 5
----------------------------------------
District 2:
primary_type count
19 Theft: All Other Larceny 927
21 Theft: BOV 888
15 Burglary / \nBreaking & Entering 642
14 Auto Theft 235
13 Aggravated Assault 174
24 Theft: Shoplifting 85
17 Rape 60
18 Robbery 55
20 Theft: Auto Parts 23
23 Theft: Pocket Picking 12
25 Theft: from Building 12
22 Theft: Coin Op Machine 10
16 Homicide: Murder & Nonnegligent Manslaughter 4
----------------------------------------
District 3:
primary_type count
32 Theft: All Other Larceny 1808
34 Theft: BOV 1355
28 Burglary / \nBreaking & Entering 953
38 Theft: Shoplifting 742
27 Auto Theft 330
26 Aggravated Assault 308
31 Robbery 156
30 Rape 96
36 Theft: Pocket Picking 60
33 Theft: Auto Parts 20
35 Theft: Coin Op Machine 16
39 Theft: from Building 12
29 Homicide: Murder & Nonnegligent Manslaughter 2
37 Theft: Purse Snatching 1
----------------------------------------
District 4:
primary_type count
46 Theft: All Other Larceny 1304
48 Theft: BOV 1110
51 Theft: Shoplifting 747
42 Burglary / \nBreaking & Entering 694
41 Auto Theft 361
40 Aggravated Assault 288
45 Robbery 162
44 Rape 106
50 Theft: Pocket Picking 51
47 Theft: Auto Parts 37
49 Theft: Coin Op Machine 29
52 Theft: from Building 17
43 Homicide: Murder & Nonnegligent Manslaughter 3
----------------------------------------
District 5:
primary_type count
58 Theft: All Other Larceny 959
60 Theft: BOV 827
55 Burglary / \nBreaking & Entering 511
54 Auto Theft 171
64 Theft: Shoplifting 142
53 Aggravated Assault 101
56 Rape 36
57 Robbery 29
62 Theft: Pocket Picking 12
65 Theft: from Building 11
59 Theft: Auto Parts 10
61 Theft: Coin Op Machine 10
63 Theft: Purse Snatching 1
----------------------------------------
District 6:
primary_type count
71 Theft: All Other Larceny 490
73 Theft: BOV 443
77 Theft: Shoplifting 304
68 Burglary / \nBreaking & Entering 238
66 Aggravated Assault 61
67 Auto Theft 58
74 Theft: Coin Op Machine 20
72 Theft: Auto Parts 16
69 Rape 15
70 Robbery 13
78 Theft: from Building 8
75 Theft: Pocket Picking 4
76 Theft: Purse Snatching 1
----------------------------------------
District 7:
primary_type count
87 Theft: BOV 1188
85 Theft: All Other Larceny 1152
81 Burglary / \nBreaking & Entering 595
90 Theft: Shoplifting 395
80 Auto Theft 185
79 Aggravated Assault 109
84 Robbery 45
86 Theft: Auto Parts 41
83 Rape 37
91 Theft: from Building 27
88 Theft: Coin Op Machine 19
89 Theft: Pocket Picking 18
82 Homicide: Murder & Nonnegligent Manslaughter 6
----------------------------------------
District 8:
primary_type count
98 Theft: All Other Larceny 631
100 Theft: BOV 526
103 Theft: Shoplifting 286
94 Burglary / \nBreaking & Entering 152
93 Auto Theft 61
92 Aggravated Assault 36
104 Theft: from Building 18
102 Theft: Pocket Picking 17
97 Robbery 14
96 Rape 10
101 Theft: Coin Op Machine 4
99 Theft: Auto Parts 3
95 Homicide: Murder & Nonnegligent Manslaughter 2
----------------------------------------
District 9:
primary_type count
111 Theft: All Other Larceny 2380
113 Theft: BOV 1059
107 Burglary / \nBreaking & Entering 469
117 Theft: Shoplifting 372
106 Auto Theft 171
105 Aggravated Assault 128
115 Theft: Pocket Picking 96
109 Rape 85
118 Theft: from Building 75
110 Robbery 73
114 Theft: Coin Op Machine 8
112 Theft: Auto Parts 7
108 Homicide: Murder & Nonnegligent Manslaughter 3
116 Theft: Purse Snatching 2
----------------------------------------
District 10:
primary_type count
127 Theft: BOV 693
125 Theft: All Other Larceny 534
121 Burglary / \nBreaking & Entering 239
130 Theft: Shoplifting 93
120 Auto Theft 83
119 Aggravated Assault 30
123 Rape 17
126 Theft: Auto Parts 17
124 Robbery 16
128 Theft: Coin Op Machine 13
131 Theft: from Building 12
129 Theft: Pocket Picking 9
122 Homicide: Murder & Nonnegligent Manslaughter 2
----------------------------------------
Surface Level analysis: top_crimes_per_district¶
The top crime in each district is some sort of theft; no clear trend is visible.
There are too many elements on the map for it to load correctly.
Maybe we can use a regression technique to predict the number of stations that would have been built based on the top crimes of that district for districts 2,4,6,7 if it is a predictor of stations in a single member district in Austin, Texas. Likely, there is a deeper reason that is not captured by the data collected : ie socio economic make up of residents in the district, presence of businesses/ other points of interest (ie parks, government buildings, proximity to target audience)-> will require more data about population in these districts during the period of crime data)
### Bikeshare trips and stations
# Drop rows with any nulls from both tables before joining.
stations_no_null = stations_all_df.dropna().copy()
trips_no_null = trips_all_df.dropna().copy()
# Ensure all station-id columns share one dtype (str) so the joins line up.
stations_no_null['station_id'] = stations_no_null['station_id'].astype(str)
for id_col in ('start_station_id', 'end_station_id'):
    trips_no_null[id_col] = trips_no_null[id_col].astype(str)
# Attach station details to each trip, once for the start station and once
# for the end station, by renaming the station key to match each side.
bikeshare_df = trips_no_null
for side in ('start', 'end'):
    bikeshare_df = bikeshare_df.merge(
        stations_no_null.rename(columns={'station_id': f'{side}_station_id',
                                         'name': f'{side}_station_name'}),
        on=f'{side}_station_id',
        how='left',
    )
bikeshare_df
| trip_id | subscriber_type | bike_id | bike_type | start_time | start_station_id | start_station_name_x | end_station_id | end_station_name_x | duration_minutes | ... | alternate_name_y | city_asset_number_y | property_type_y | number_of_docks_y | power_type_y | footprint_length_y | footprint_width_y | notes_y | council_district_y | modified_date_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27528286 | Single Trip (Pay-as-you-ride) | 21422 | electric | 2022-08-20 14:03:38+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 102 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 1 | 28953173 | Student Membership | 21821 | electric | 2023-02-26 02:33:47+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 2 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 2 | 28833031 | Explorer | 18181 | electric | 2023-02-04 17:52:20+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 4 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 3 | 28181257 | Local31 | 829 | classic | 2022-10-13 14:42:31+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 13 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 4 | 28344994 | Explorer | 996 | classic | 2022-10-29 16:01:27+00:00 | 4051 | 10th/Red River | 4051 | 10th/Red River | 129 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2230059 | 7180812 | 24 Hour Walk Up Pass | 133 | classic | 2015-10-11 21:48:03+00:00 | 1006 | Zilker Park West | 1006 | Zilker Park West | 2 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 2230060 | 7165104 | 24 Hour Walk Up Pass | 765 | classic | 2015-10-10 19:50:35+00:00 | 1006 | Zilker Park West | 1006 | Zilker Park West | 32 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 2230061 | 7162613 | 24 Hour Walk Up Pass | 922 | classic | 2015-10-10 17:29:40+00:00 | 1006 | Zilker Park West | 1006 | Zilker Park West | 34 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 2230062 | 7136423 | 24 Hour Walk Up Pass | 898 | classic | 2015-10-09 12:05:41+00:00 | 1006 | Zilker Park West | 1006 | Zilker Park West | 26 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
| 2230063 | 7036888 | 24 Hour Walk Up Pass | 559 | classic | 2015-10-03 13:45:45+00:00 | 1006 | Zilker Park West | 1006 | Zilker Park West | 27 | ... | NaN | <NA> | NaN | <NA> | NaN | <NA> | NaN | NaN | <NA> | NaT |
2230064 rows × 38 columns
bikeshare_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2230064 entries, 0 to 2230063 Data columns (total 38 columns): # Column Dtype --- ------ ----- 0 trip_id object 1 subscriber_type object 2 bike_id object 3 bike_type object 4 start_time datetime64[us, UTC] 5 start_station_id object 6 start_station_name_x object 7 end_station_id object 8 end_station_name_x object 9 duration_minutes Int64 10 start_station_name_y object 11 status_x object 12 location_x object 13 address_x object 14 alternate_name_x object 15 city_asset_number_x Int64 16 property_type_x object 17 number_of_docks_x Int64 18 power_type_x object 19 footprint_length_x Int64 20 footprint_width_x float64 21 notes_x object 22 council_district_x Int64 23 modified_date_x datetime64[us, UTC] 24 end_station_name_y object 25 status_y object 26 location_y object 27 address_y object 28 alternate_name_y object 29 city_asset_number_y Int64 30 property_type_y object 31 number_of_docks_y Int64 32 power_type_y object 33 footprint_length_y Int64 34 footprint_width_y float64 35 notes_y object 36 council_district_y Int64 37 modified_date_y datetime64[us, UTC] dtypes: Int64(9), datetime64[us, UTC](3), float64(2), object(24) memory usage: 665.7+ MB
# Clean up this join: keep only the trip and location columns of interest.
keep_cols = ['trip_id', 'bike_id', 'bike_type', 'start_time', 'start_station_id',
             'duration_minutes', 'location_x', 'address_x', 'end_station_id',
             'location_y', 'address_y']
cleaned_bikeshare_df = bikeshare_df[keep_cols].copy()
# 'start_time' is already a datetime column and 'duration_minutes' is numeric,
# so the trip end time is simply start + duration.
cleaned_bikeshare_df['end_time'] = (
    cleaned_bikeshare_df['start_time']
    + pd.to_timedelta(cleaned_bikeshare_df['duration_minutes'], unit='m')
)
# Reorder so 'end_time' sits immediately after 'start_time'.
reordered = cleaned_bikeshare_df.columns.tolist()
reordered.insert(reordered.index('start_time') + 1,
                 reordered.pop(reordered.index('end_time')))
cleaned_bikeshare_df = cleaned_bikeshare_df[reordered]
cleaned_bikeshare_df
| trip_id | bike_id | bike_type | start_time | end_time | start_station_id | duration_minutes | location_x | address_x | end_station_id | location_y | address_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27528286 | 21422 | electric | 2022-08-20 14:03:38+00:00 | 2022-08-20 15:45:38+00:00 | 4051 | 102 | NaN | NaN | 4051 | NaN | NaN |
| 1 | 28953173 | 21821 | electric | 2023-02-26 02:33:47+00:00 | 2023-02-26 02:35:47+00:00 | 4051 | 2 | NaN | NaN | 4051 | NaN | NaN |
| 2 | 28833031 | 18181 | electric | 2023-02-04 17:52:20+00:00 | 2023-02-04 17:56:20+00:00 | 4051 | 4 | NaN | NaN | 4051 | NaN | NaN |
| 3 | 28181257 | 829 | classic | 2022-10-13 14:42:31+00:00 | 2022-10-13 14:55:31+00:00 | 4051 | 13 | NaN | NaN | 4051 | NaN | NaN |
| 4 | 28344994 | 996 | classic | 2022-10-29 16:01:27+00:00 | 2022-10-29 18:10:27+00:00 | 4051 | 129 | NaN | NaN | 4051 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2230059 | 7180812 | 133 | classic | 2015-10-11 21:48:03+00:00 | 2015-10-11 21:50:03+00:00 | 1006 | 2 | NaN | NaN | 1006 | NaN | NaN |
| 2230060 | 7165104 | 765 | classic | 2015-10-10 19:50:35+00:00 | 2015-10-10 20:22:35+00:00 | 1006 | 32 | NaN | NaN | 1006 | NaN | NaN |
| 2230061 | 7162613 | 922 | classic | 2015-10-10 17:29:40+00:00 | 2015-10-10 18:03:40+00:00 | 1006 | 34 | NaN | NaN | 1006 | NaN | NaN |
| 2230062 | 7136423 | 898 | classic | 2015-10-09 12:05:41+00:00 | 2015-10-09 12:31:41+00:00 | 1006 | 26 | NaN | NaN | 1006 | NaN | NaN |
| 2230063 | 7036888 | 559 | classic | 2015-10-03 13:45:45+00:00 | 2015-10-03 14:12:45+00:00 | 1006 | 27 | NaN | NaN | 1006 | NaN | NaN |
2230064 rows × 12 columns
# Stack start and end visits into one long table: one row per time a bike
# touched a station, with a unified 'station_id' column.
all_visits = pd.concat(
    [
        bikeshare_df[['bike_id', 'start_station_id']].rename(columns={'start_station_id': 'station_id'}),
        bikeshare_df[['bike_id', 'end_station_id']].rename(columns={'end_station_id': 'station_id'}),
    ],
    axis=0,
).dropna()
# Tally visits per (bike, station) pair.
station_counts = all_visits.groupby(['bike_id', 'station_id']).size().reset_index(name='visit_count')
# Keep each bike's most-visited station (first row after a descending sort),
# then present the result ordered by bike id.
top_stations = (
    station_counts.sort_values('visit_count', ascending=False)
    .drop_duplicates(subset=['bike_id'])
    .sort_values('bike_id')
    .reset_index(drop=True)
)
print(top_stations)
bike_id station_id visit_count 0 003 4061 9 1 004G 3798 255 2 005 3798 22 3 009G 3798 177 4 011G 3798 75 ... ... ... ... 1019 Block02 2540 10 1020 Block03 2565 2 1021 Block04 2572 2 1022 Block05 4061 2 1023 Grifford 4938 24 [1024 rows x 3 columns]
# Busiest (bike, station) pairing overall: the single row with the maximum
# visit_count (ties resolve to the first occurrence, matching idxmax).
most_visited = station_counts.nlargest(1, 'visit_count').iloc[0]
print(most_visited)
bike_id 21650 station_id 3798 visit_count 993 Name: 39570, dtype: object
Final Map Visualisation:¶
# Add station markers to the crime map to see if there are any interesting trends.
try:
    folium.GeoJson(cycle_path_data).add_to(crime_district_map)
except Exception as e:
    print(e)
# Re-use the cleaned station table; each popup shows id, name, address, modified_date.
# BUG FIX: the last popup column was labelled 'Status' but actually holds
# modified_date; relabel it 'Modified Date' so the popup is not misleading.
for _, stn in cleaned_stations_df.iterrows():
    marker_info = pd.DataFrame(
        data=[[stn['station_id'], stn['name'], stn['address'], stn['modified_date']]],
        columns=['Station ID', 'Station Name', 'Address', 'Modified Date'],
    )
    html_marker_info = marker_info.to_html()
    # 'location' is a string like '(30.26, -97.73)' of varying length: strip the
    # parentheses and split on the comma to recover the coordinates as floats.
    lat_str, long_str = stn['location'].split(',')
    stn_lat = float(lat_str.split('(')[1])
    stn_long = float(long_str.split(')')[0])
    folium.Marker(
        location=[stn_lat, stn_long],
        popup=folium.Popup(html_marker_info, max_width="500%"),
    ).add_to(crime_district_map)
crime_district_map.save("crime_district_map_with_stations.html")
Open the crime_district_map_with_stations.html document in a new browser tab¶
*In District 8, in Zilker park, along Barton Springs Road, 64 instances of theft across 2 stations in just 2014. Perhaps, the presence of the bikes and the stations attract opportunistic criminals due to the vulnerable users not paying attention to surroundings while interacting with the bike share system?*
Potential Areas for further research in the future:¶
# Investigate correlation between features
# Perform some kind of null hypothesis test?
# Save some relevant data into a database?
# Normalise the database
# Build up a compound table
# Perform some kind of data filtering/ aggregation on a view of the data?
# Consider some procedures on steps that may be called together frequently